
#
# SET-UP --------------------------------------------------------------------------------------------
#

# Start from a clean workspace (later code checks exists('dta_all'), so stale objects matter)
rm(list = ls())

### Source the '00_country_global.R' script with required packages and functions
# (located in the same folder as the script currently open in RStudio)
Sys.sleep(.5) # short pause before querying the RStudio document context
source(file.path(dirname(rstudioapi::getActiveDocumentContext()$path), '00_country_global.R'))


# Make a copy of the file in the 'code/00_ARCHIVE' folder, named '<script> - copy.R'
# NOTE: the extension pattern is anchored ('\\.R$') so that only the file extension is rewritten,
# not any other '.R' that may occur earlier in the path (e.g. a folder called 'my.Rproject')
script_path = rstudioapi::getSourceEditorContext()$path
archive_path = gsub('code', 'code/00_ARCHIVE', sub('\\.R$', ' - copy.R', script_path))
file.copy(script_path, archive_path, overwrite = TRUE, copy.date = TRUE)




# ' ------------------------------------------------------------------------------------------------------------------------------------------------------
# READ DATA --------------------------------------------------------------------------------------------------------------------------------------------
#

# record start time
t0 = Sys.time()

# load the final cleaned dataset (file format is taken from the global 'format1')
country = read_flex(
  file.path(main_dir, 'data', 'clean', "country_full_final"),
  format = format1
)
gc()
beep()

### xxx temp
# flag as 'ascendido' those marked 'ascendido' in promocion_anyo but 'no ascendido' in promocion_anyo_org;
# everything else becomes 'no ascendido'
country[, promocion_anyo_org_out := {
  ascendido_fuera = promocion_anyo_org == 'no ascendido' & promocion_anyo == 'ascendido'
  fifelse(ascendido_fuera, 'ascendido', 'no ascendido')
}]






#### leave unique org-anyo combinations -----------------------------------------------------------------------------------------------------------------------------------
# NOTE: we only want to keep org-year combinations that occur in the dataset after filtering those
# combinations with <10 unique IDs (see script 06). Otherwise the complete() function in the 
# summary function below will extend the values to combinations we don't want
# distinct year x organization pairs present in 'country'
org_anyo = unique(country[, .(anyo, entidad_nombre)])



### list the columns of the final 'country' dataset that are NOT recorded in the codebook
codebook = read_excel(
  file.path('data', 'clean', 'country_codebook.xlsx'),
  sheet = 2
)

names(country)[!(names(country) %in% codebook$var_name_final)]



### (*)checks > gender gap
## NOTE: at one point the script was feeding the changes back to the raw file and producing nonsensical and
## unstable values. This is a sanity check - gender gaps should be ~1.20-1.24 for all years (in favor of men)

### pay gap (mean pay of men / mean pay of women, by year)
country[,
        fmean(pago_bruto[genero == 'hombre']) / fmean(pago_bruto[genero == 'mujer']),
        by = anyo]

### gender directivo gap (women should be ~40-44% of directivos)
country[, {
  es_directivo = grado == "directivo"
  uniqueN(person_id[es_directivo & genero == 'mujer']) / uniqueN(person_id[es_directivo])
}, by = anyo]



# # checks > number of unique orgs should increase from ~670 in 2019 to ~940 in 2024
# org_anyo[,uniqueN(entidad_nombre), by=anyo]


### GDP-exp & inflation data ---------------------------------------------------------------------------------------------------------------------------------
# GDP / expenditure table: lower-case the country name, force the monetary columns to numeric,
# keep only the rows for this country
gdp_exp = read_flex(
  file.path('data', 'clean', 'additional_data', 'all_gdp_expenditure'),
  format = 'xlsx'
) %>%
  clean_names() %>%
  mutate(
    country = tolower(country),
    across(matches('_lcu|_usd|_alt|exp_nominal'), as.numeric)
  ) %>%
  filter(country == 'country')


### inflation data (same treatment: numeric inflation columns, this country only)
all_inflation = read_flex(
  file.path('data', 'clean', 'additional_data', 'all_inflation'),
  format = 'xlsx'
) %>%
  clean_names() %>%
  mutate(
    country = tolower(country),
    across(matches('inflation'), as.numeric)
  ) %>%
  filter(country == 'country')
  






# ' ------------------------------------------------------------------------------------------------------------------------------------------------------
# SUMMARY FUNCTION  --------------------------------------------------------------------------------------------------------------------------------------------
#
gc()

# (re)load the instructions table
read_instructions()

# Values below mirror the arguments of dash_summary() so that its body can be stepped through interactively
row_instructions = country_instructions$fila[27]
row_instructions = 30 # overrides the line above
country_instructions[country_instructions$fila == row_instructions, ]

grupo_central = 'country' #  'sectorial_nombre', 'entidad_nombre') # 'entidad_nombre')
cubertura_type = 'max_cubertura' # max_cubertura   max_comparabilidad



# dash_summary(): compute one dashboard indicator from the global 'country' dataset.
#
# The indicator is defined by the row of 'country_instructions' whose 'fila' equals `row_instructions`
# (columns used: indicador, variable_tipo, anyo_inicio, anyo_fin, valor_tipo, zeros, grupos_adicionales, filtro).
#
# Arguments:
#   row_instructions  value of 'fila' identifying the instructions row to use
#   cubertura_type    'max_cubertura' (all orgs) or 'max_comparabilidad' (only rows with cubertura == 'completo')
#   grupo_central     core grouping columns: 'country', optionally with 'sectorial_nombre' and 'entidad_nombre'
#
# Returns: a data frame with 'indicador', 'anyo', 'country', 'cubertura', the core grouping columns,
#   any additional grouping columns renamed to group_1, group_2, ..., plus 'fte_sum' and 'value'.
#
# Stops with an error if a required parameter still contains 'xxx' or if the result has 0 rows.
#
# Reads these globals: country, country_instructions, org_anyo, all_inflation, gdp_exp
# (and helper functions defined outside this file's visible part: sum_miss, mean_miss, fdistinct).
# Side effects: detaches 'tidylog' if attached; prints summary() of the resulting values.
dash_summary = function(row_instructions, # define row of the instructions file to use
                        cubertura_type = 'max_comparabilidad', #  max_cubertura (all orgs) vs. max_comparabilidad (only orgs in all years)
                        grupo_central = 'country' # we have 3 basic groupings - (1) cubertura (2) by sector (3) by entidad ->
                        # dta = country # define dataset to use (country name)
) {
  
  
  ### detach tidylog (if it's loaded) to limit messages displayed
  if('package:tidylog' %in% search()){detach('package:tidylog', unload = TRUE)}
  
  ### define parameters
  fila_sel = country_instructions$fila == row_instructions # instructions row(s) to read from
  
  # key required parameters
  indicador = country_instructions$indicador[fila_sel]
  variable_tipo = country_instructions$variable_tipo[fila_sel]
  anyo_inicio = country_instructions$anyo_inicio[fila_sel]
  anyo_fin   = country_instructions$anyo_fin[fila_sel]
  valor_tipo = country_instructions$valor_tipo[fila_sel]
  zeros = country_instructions$zeros[fila_sel]
  
  # optional parameters
  grupos_adicionales = strsplit(country_instructions$grupos_adicionales[fila_sel], ",\\s*")[[1]] # comma-separated list -> vector
  filtro = country_instructions$filtro[fila_sel]
  # filtro = trimws(gsub("[\r\n]", " ", country_instructions$filtro[country_instructions$fila == row_instructions]))
  
  
  ### fixed values? (same values regardless of user choices in the dashboard)
  # for any indicator grouping by 1) sector 2) organization 3) region 4) key position
  if(any(grepl('sectorial|entidad|region|position', grupos_adicionales))){
    valor_tipo = paste0(valor_tipo, ', fixed')
  }
  
  
  dta = country
  
  ### extract which years covered and how directly from indicator name? XXXX
  # e.g por ano = values for all years, promedio = average of values, one year = only this year
  # + less chance of mistakenly specifying years  || - extra coding, easier to edit years in Excel, looks cleaner to have them separately
  # anyos = extract_parentheses(country_instructions$indicador[i])
  
  
  
  ### check arguments passed to the function -------------------------------------------------------------------------------------------
  
  # check if all key parameters defined, if not, break
  if(any(grepl('xxx', c(indicador, variable_tipo, anyo_inicio, anyo_fin, valor_tipo), ignore.case = TRUE))){
    stop('FUNCTION DISRUPTED: ONE OF REQUIRED PARAMETERS REMAINS UNDEFINED (CONTAINS XXX)')
  }
  
  
  # if any optional parameters undefined (xxx), ensure they are treated as boolean FALSE
  if(any(grepl('xxx', grupos_adicionales))){grupos_adicionales = FALSE}
  if(grepl('xxx', filtro)){filtro = FALSE}
  
  
  
  #### remove columns that are not requested   -----------------------------------------------------------------------------------------------
  # any_of() silently ignores names that are not columns (e.g. the 'xxx' placeholder below)
  dta = dta %>% dplyr::select(any_of(c(
    'cubertura',
    grupo_central, 'anyo',  # 'coverage', # standard grouping variables
    grupos_adicionales, # additional grouping variables (if any)
    'fte', # keep fte (for weighting)
    variable_tipo, valor_tipo,  # variables defining the outcome
    ifelse(grepl('share_of_pago', valor_tipo), 'pago_bruto', 'xxx') # also keep pago_bruto if the outcome is any type of extra payment
  )))
  
  
  
  
  #### dynamic outcome variable  -----------------------------------------------------------------------------------------------
  # copy the column named in 'variable_tipo' into 'outcome_var', which all calculations below use
  dta = dta %>% mutate(outcome_var = .data[[variable_tipo]])
  
  
  #### dynamic grouping ----------------------------------------------------------------------------------------
  grupo_all <- c()
  if (!is.null(grupo_central) && !grepl('fixed', valor_tipo)) grupo_all <- c(grupo_all, grupo_central)
  if (grepl('fixed', valor_tipo)) grupo_all <- c(grupo_all, 'country') # fixed indicators ignore the core grouping requested
  if (!any(grupos_adicionales == FALSE | grupos_adicionales == 'xxx')) grupo_all <- unique(c(grupo_all, grupos_adicionales))
  
  # if grouping by organization, then don't complete missing cases by sector, otherwise, each organization will get listed under each sector
  if (any(grepl('entidad_nombre', grupo_all)) &&
      any(grepl('sectorial_nombre', grupo_all))){grupo_all <- setdiff(grupo_all, 'sectorial_nombre')}
  
  
  ### existing groups ----------------------------------------------------------------------------------------------------------------
  # year x group combinations present in the data BEFORE any year/coverage filtering; used at the end
  # (when zeros != 'no') to set fte_sum to 0 for combinations that exist but ended up with missing fte_sum
  grupo_exist = grupo_all
  if(length(grupos_adicionales) > 1 && (grupos_adicionales[2] != 'genero')){grupo_exist = grupo_exist[-(length(grupo_exist))]} # drop the last grouping
  if(any(grepl('leaver_|hire_|promocion_', grupo_exist))){
    grupo_exist = grupo_exist[!grepl('leaver_|hire_|promocion_', grupo_exist)]
  }
  
  existing_groups = dta %>%
    select(any_of(c('anyo', grupo_exist))) %>% 
    distinct() %>%
    mutate(group_exists = TRUE)
  
  
  
  
  # filter by year and cubertura, if needed -------------------------------------------------------------------------------------------------------------------------------------------
  # based on arguments either keep or remove organizations without full coverage
  # (the two filters are independent, so applying both covers the combined case as well)
  if(anyo_inicio == 2024){
    dta <- dta[anyo == 2024]
  }
  if(cubertura_type == 'max_comparabilidad'){ # max_cubertura (all orgs) vs. max_comparabilidad (only orgs in all years)
    dta <- dta[cubertura == 'completo']
  }
  
  
  
  
  ## filter NAs ------------------------------------------------------------------------------------------------------------------------------
  # drop rows with a missing value in cubertura, anyo or any grouping column; cubertura is not needed afterwards
  dta <- dta[complete.cases(dta[, .SD, .SDcols = c('cubertura',"anyo", grupo_all)])]
  dta[, cubertura := NULL]    
  
  
  
  ### > CALCULATE VALUES (dplyr) --------------------------------------------------------------------------------------------------
  
  
  
  ## fast grouping -> NOTE: unless dta created as deep copy of country, this will mess up the original 'country' dataset
  # setkeyv(dta, c("anyo", grupo_all))  
  
  
  # sum up pago_bruto first (for 'composicion salarial' variables); used as the denominator further below
  if(grepl('share_of_pago', valor_tipo)){
    dta_pago_bruto =  dta %>% 
      # group_by(across(all_of(c("anyo", grupo_all)))) %>% # NOTE: group_by() requires the use of across(), fgroup_by accepts character vectors directly
      fgroup_by(c("anyo", grupo_all)) %>% # 
      fsummarise(pago_bruto = fsum(pago_bruto, na.rm=TRUE), fte_sum = sum_miss(fte))
  }
  
  
  dta = dta %>% 
    
    fgroup_by(c("anyo", grupo_all)) %>% # group by year and any other variables as defined above (note that
    # for a few indicators where monthly values are needed 'month' is a separate
    # variable specified in 'grupos_adicionales' as needed)
    
    # filter(n() >= 10) %>% # filter if fewer than X respondents
    
    fsummarise( # type of values we want, depending on the prefix/content of 'valor_tipo'
      fte_sum = sum_miss(fte),
      value = 
        if    (grepl('^sum', valor_tipo)) sum_miss(outcome_var) # calculate a sum
        # calculate a mean (fte-weighted) - if someone earns salary X for 6 months this should be discounted
        # relative to someone earning X for 12 months (imagine we had monthly values - the first person would
        # have only 6 entries, the second 12 entries). For monthly data, all FTE = 1/12, so weighting doesn't matter
        else if (grepl('^mean', valor_tipo)) weighted.mean(outcome_var, fte, na.rm=TRUE)
        # calculate a median (fte-weighted, same reasoning as for the mean)
        else if (grepl('^median', valor_tipo)) Hmisc::wtd.quantile(outcome_var, weights = fte, probs=0.5, na.rm=TRUE)
        else if (grepl('^person_id|^puesto_nombre', valor_tipo)) fdistinct(outcome_var) # count unique IDs/names
        else if (grepl('^employees', valor_tipo)) fte_sum # N of Full-Time Equivalents (employee-months) - divide by 12, unless we are also grouping by month
        # else if (valor_tipo %in% c("ratio of pago_total")) sum_miss(outcome_var) / sum_miss(pago_total)
        else if (grepl('multiplier10', valor_tipo)) weighted.mean(outcome_var[outcome_var >= quantile(outcome_var, 0.9, na.rm=TRUE)], # take mean outcome for top 10%
                                                                  fte[outcome_var >= quantile(outcome_var, 0.9, na.rm=TRUE)], na.rm=TRUE)/
          weighted.mean(outcome_var[outcome_var <= quantile(outcome_var, 0.1, na.rm=TRUE)], # divide by mean outcome of bottom 10% (values AT OR BELOW the 10th percentile)
                        fte[outcome_var <= quantile(outcome_var, 0.1, na.rm=TRUE)], na.rm=TRUE)
        
        else if (grepl('multiplier20', valor_tipo)) weighted.mean(outcome_var[outcome_var >= quantile(outcome_var, 0.8, na.rm=TRUE)], # as above: top 20% ...
                                                                  fte[outcome_var >= quantile(outcome_var, 0.8, na.rm=TRUE)], na.rm=TRUE)/
          weighted.mean(outcome_var[outcome_var <= quantile(outcome_var, 0.2, na.rm=TRUE)], # ... divided by bottom 20%
                        fte[outcome_var <= quantile(outcome_var, 0.2, na.rm=TRUE)], na.rm=TRUE)     
        else NA_real_) %>%
    # expand to all year x group combinations: added rows get value = 0 and fte_sum = NA
    complete(anyo, !!!syms(grupo_all), fill = list(fte_sum = NA, value = 0)) %>% 
    ungroup()
  
  
  
  ### filter org-anyo ----------------------------------------------------------------------------------------------------------
  # keep only the year x organization combinations present in the global 'org_anyo'
  # (complete() above may have added combinations that do not occur in the data)
  if(any(grepl('entidad_nombre', grupo_all))){
    dta = org_anyo[dta, on = .(anyo, entidad_nombre), nomatch = 0]
  }
  
  
  ### filter 0s ----------------------------------------------------------------------------------------------------------
  # when zeros are not wanted, rows added by complete() (fte_sum is NA) get a missing value instead of 0
  if(zeros == 'no'){
    dta$value = ifelse(is.na(dta$fte_sum), NA, dta$value)
  }
  
  
  ### filter small groups --------------------------------------------------------------------------------
  # (size filters currently disabled; only the ungroup remains)
  dta = dta %>%
    # group_by(across(all_of(c("anyo", grupo_all[grupo_all %in% c('sectorial_nombre', 'entidad_nombre')])))) %>%
    # filter(!any(fte_sum <= min_size)) %>%
    # filter(sum(fte_sum) >= min_size) %>%
    # select(-c(any_of('fte_sum'))) %>% # remove fte since we no longer need it?
    ungroup
  
  # print(paste0('AFTER: ', nrow(dta))) # control how many rows after
  
  
  
  ### ADDITIONAL OPERATIONS   -----------------------------------------------------------------------------------------------
  # if 'valor_tipo' contains a comma, it means there are additional operations to conduct (handled one by one below)
  
  if(!(anyo_inicio %in% c(2019,2024))){ # if first and last years different than 2019 and 2024, 
    # filter accordingly (quicker to do here than on the non-aggregated dataset)
    dta = dta[dta$anyo >= anyo_inicio, ]
    dta = dta[dta$anyo <= anyo_fin, ]
  }
  
  
  ### if need real (inflation-adjusted values) - multiply by the right value
  # (much faster to do here than to multiply all entries for all pay types)
  # NOTE: Needs to be done before 'year-to-year' adjustments below
  if(grepl('inflacion', valor_tipo)){
    dta = left_join(dta, all_inflation %>% select(c(anyo, inflation_aggregate_2024))) %>% 
      mutate(value = value * inflation_aggregate_2024) %>% 
      select(-c(inflation_aggregate_2024))
  }
  
  
  ### shares of GDP / expenditure
  if(grepl('share_gdp|share_exp', valor_tipo)){
    
    # make the join key the same type on both sides (gdp_exp is modified only locally)
    dta$anyo = as.numeric(dta$anyo)
    gdp_exp$anyo = as.numeric(gdp_exp$anyo)
    
    dta = dta %>% left_join(., gdp_exp, by = c('anyo', 'country'))
    
    # NOTE(review): the 10^12 and 10^3 factors presumably convert the units of gdp_lcu2 / exp_nominal - confirm against the source file
    if(grepl('share_gdp', valor_tipo)){dta$value = dta$value / (10^12 * as.numeric(dta$gdp_lcu2))}
    if(grepl('share_exp', valor_tipo)){dta$value = dta$value /  (10^3 * as.numeric(dta$exp_nominal))}
    
    dta = dta %>% dplyr::select(-c(matches('^exchange|gdp_|^exp_'))) # de-select the columns brought in by the join
    
  }
  
  
  
  ### shares of total gross pay (denominator computed above as dta_pago_bruto)
  if(grepl('share_of_pago', valor_tipo)){
    dta = left_join(dta, dta_pago_bruto %>% select(-c(any_of(c('fte_sum'))))) %>% 
      mutate(value=value/pago_bruto) %>% select(-any_of(c('pago_bruto')))
  }
  
  
  # if we want shares of total
  if(grepl('share_of_total', valor_tipo)){
    
    dta = dta %>% ungroup() %>%
      group_by(across(all_of(c("anyo", grupo_all[-length(grupo_all)])))) %>% # groups for which we want values -> all but the last one
      # as this excludes the category by which we DON'T WANT to group here (e.g. % of contracts by organization -> keeps only organization)
      # filter(fdistinct(!!sym(grupos_adicionales[1])) == n1) %>%  # remove groups when <total number of groups is present
      mutate(value = value / sum_miss(value), # calculate shares of total
             # fte_sum = fte_sum / sum(fte_sum)
      ) %>% ungroup()
  }
  
  
  ### if we want gender gap (value for men / value for women - 1, within every other grouping)
  if(grepl('gender_gap', valor_tipo)){
    
    grupo_sin_genero = setdiff(grupo_all, 'genero')
    
    dta = dta %>% 
      filter(!is.na(genero)) %>% 
      filter(if_all(all_of(c("anyo", grupo_all)), ~ !is.na(.))) %>%
      
      group_by(across(all_of(c('anyo', grupo_sin_genero)))) %>% 
      filter(fdistinct(genero) == 2) %>% # keep only groups where both genders are present
      mutate(value = value[genero == 'hombre']/value[genero == 'mujer'] - 1) %>% 
      ungroup %>% 
      filter(genero == 'mujer') %>% # both rows now carry the same value; keep one
      select(-c(genero)) %>%
      distinct
    
    grupo_all = grupo_sin_genero
    
  }
  
  ### if we want ratio of directors (value for non-directors / value for directors)
  if(grepl('directivo_ratio', valor_tipo)){
    
    grupo_sin_directivo = setdiff(grupo_all, 'grado_directivo_anyo')
    
    dta = dta %>% ungroup() %>% 
      filter(!is.na(grado_directivo_anyo)) %>% 
      filter(if_all(all_of(c("anyo", grupo_all)), ~ !is.na(.))) %>%
      
      group_by(across(all_of(c('anyo', grupo_sin_directivo)))) %>% 
      filter(fdistinct(grado_directivo_anyo) == 2) %>% # keep only groups where both categories are present
      
      mutate(value = value[grado_directivo_anyo == 'no directivo o gerente']/value[grado_directivo_anyo == 'directivo o gerente'],
             value = ifelse(is.infinite(value), NA, value)) %>% 
      
      ungroup %>% 
      filter(grado_directivo_anyo == 'directivo o gerente') %>% 
      # select(-c(grado_directivo_anyo)) %>% 
      distinct
    
    grupo_all = grupo_sin_directivo
    
  }
  
  
  
  ### cumulative values over the 5 years following 'anyo_fin' (by fecha_jubilacion_anyo)
  # NOTE(review): the grouping below is 'every column except value and fecha_jubilacion_anyo', which includes
  # fte_sum, and fte_sum is then summarised - confirm this behaves as intended. summarise() also returns
  # several rows per group here (cumsum), which newer dplyr versions flag as deprecated in favour of reframe()
  if(grepl('cumulative_5f', valor_tipo)){
    dta = dta %>%
      filter(fecha_jubilacion_anyo %in% (((as.numeric(anyo_fin) + 1)):((as.numeric(anyo_fin) + 5)))) %>% 
      # select(-c(fecha_jubilacion_anyo)) %>% 
      group_by(across(-c(value, fecha_jubilacion_anyo))) %>% 
      summarise(value = cumsum(value),
                fte_sum = sum_miss(fte_sum))
    
    # grupo_all = grupo_all[-which(grupo_all == 'fecha_jubilacion_anyo')]
    
  }
  
  
  
  # to calculate year-to-year (or month-to-month) change - either to base year/month set to 100
  # or just as % of the previous value
  # NEEDS TO BE IN THIS PLACE, NOT BEFORE OR AFTER OTHER CONDITIONAL STATEMENTS
  
  if(grepl('year_to_base_year', valor_tipo)){
    
    dta = dta %>% 
      group_by(across(all_of(c(grupo_all)))) %>%
      filter(is.finite(min(anyo[value > 0]))) %>% # drop groups with no positive value in any year
      filter(max(anyo_fin) >= min(anyo[value > 0])) %>% 
      arrange(anyo) %>% 
      mutate(
        value = ifelse(anyo == min(anyo[value > 0]), 100, 100*value/value[anyo == min(anyo[value>0])]), # assign 100 to the first year with a positive value and then proportion of that first year thereafter
      ) %>% 
      ungroup()
  }
  
  if(grepl('year_to_year', valor_tipo)){
    
    dta = dta %>% 
      group_by(across(all_of(c(grupo_all)))) %>%
      filter(is.finite(min(anyo[value > 0]))) %>% # drop groups with no positive value in any year
      filter(max(anyo_fin) >= min(anyo[value > 0])) %>% 
      arrange(anyo) %>% 
      mutate(
        value = ifelse(anyo == min(anyo[value > 0]), 100, 100*value/lag(value)), # assign 100 to the first year with a positive value and then proportion of the previous year thereafter
      ) %>% 
      ungroup()
  }
  
  if(grepl('month_to_month', valor_tipo)){
    
    dta = dta %>% 
      group_by(across(all_of(c(setdiff(grupo_all, c('mes', 'anyo')))))) %>%
      filter(is.finite(min(mes[value > 0]))) %>% 
      arrange(anyo, mes) %>% 
      mutate(
        value = ifelse(row_number() == 1, 100, 100*value/lag(value)), # assign 100 to the first row of each group and then proportion of the previous month thereafter
      ) %>% 
      ungroup()
  }
  
  
  
  ### if we want average across years (anyo columns marked with 'promedio' instead of year number)
  # NEEDS TO BE AFTER ALL OTHER CONDITIONS APART FROM 'FIXED'
  if(grepl('aggregate', valor_tipo)){
    dta = dta   %>% 
      filter(anyo != anyo_inicio) %>%  # otherwise the baseline year is included
      mutate(anyo = paste0('promedio (', anyo_inicio, '-', anyo_fin, ')')) %>% 
      group_by(across(all_of(c("anyo", grupo_all)))) %>% 
      summarize(
        value = mean_miss(value),
        fte_sum = sum_miss(fte_sum)
        # fte_sum = mean_miss(fte_sum)
      )
  }
  
  
  # if fixed values needed, i.e. disregarding user selections, add expansion across all possible
  # combinations of core grouping variables, which will replicate the values for whatever level is 
  # not there (so e.g. each sector will give the same value regardless of the organization selected)
  if(grepl('fixed', valor_tipo)){
    expand1 = unique(country[, .(country, sectorial_nombre, entidad_nombre)])
    dta = full_join(expand1, dta)
  }
  
  
  ### substantive 0s ---------------------------------------------------------------------------------------------------------
  # (pr_na(dta$fte_sum == 0))
  # (pr_na(dta$value == 0))
  # 
  
  # when zeros are kept: combinations that existed in the unfiltered data but have missing fte_sum get fte_sum = 0
  # (anyo is converted to character on both sides so that the join also works for the 'promedio (...)' labels)
  if(zeros != 'no'){
    dta = left_join(dta %>% mutate(anyo=as.character(anyo)), 
                    existing_groups %>% mutate(anyo=as.character(anyo))) %>%
      ungroup() %>% 
      mutate(fte_sum = ifelse((group_exists & is.na(fte_sum)), 0, fte_sum)) %>% 
      select(-c(group_exists))
    
    
  }
  
  # pr_na(dta2$fte_sum == 0)
  
  
  ### FILTER SUB-GROUPS -----------------------------------------------------------------------------------------------------
  # each keyword found in 'filtro' (as a whole word) keeps one category of the matching column and then drops that column
  if(grepl(paste0('\\b', 'genero_mujer', '\\b'),   filtro)){dta = dta %>% filter(genero == 'mujer') %>% select(-c(genero))}
  
  if(grepl(paste0('\\b', 'hire_anyo', '\\b'),   filtro)){dta = dta %>% filter(hire_anyo == 'nuevo contratado') %>% select(-c(hire_anyo))}
  if(grepl(paste0('\\b', 'leaver_anyo', '\\b'), filtro)){dta = dta %>% filter(leaver_anyo == 'egrasado') %>% select(-c(leaver_anyo))}
  if(grepl(paste0('\\b', 'hire_leaver_anyo', '\\b'), filtro)){dta = dta %>% filter(hire_leaver_anyo == 'nuevo o egrasado') %>% select(-c(hire_leaver_anyo))}
  if(grepl(paste0('\\b', 'no_hire_leaver_anyo', '\\b'), filtro)){dta = dta %>% filter(hire_leaver_anyo == 'personal existente') %>% select(-c(hire_leaver_anyo))}
  
  if(grepl(paste0('\\b', 'hire_anyo_org', '\\b'),   filtro)){dta = dta %>% filter(hire_anyo_org == 'nuevo contratado') %>% select(-c(hire_anyo_org))}
  if(grepl(paste0('\\b', 'leaver_anyo_org', '\\b'), filtro)){dta = dta %>% filter(leaver_anyo_org == 'egrasado') %>% select(-c(leaver_anyo_org))}
  if(grepl(paste0('\\b', 'hire_leaver_anyo_org', '\\b'), filtro)){dta = dta %>% filter(hire_leaver_anyo_org == 'nuevo o egrasado') %>% select(-c(hire_leaver_anyo_org))}
  
  if(grepl(paste0('\\b', 'hire_anyo_org_only', '\\b'),   filtro)){dta = dta %>% filter(hire_anyo_org_only == 'nuevo contratado') %>% select(-c(hire_anyo_org_only))}
  if(grepl(paste0('\\b', 'leaver_anyo_org_only', '\\b'), filtro)){dta = dta %>% filter(leaver_anyo_org_only == 'egrasado') %>% select(-c(leaver_anyo_org_only))}
  if(grepl(paste0('\\b', 'hire_leaver_anyo_org_only', '\\b'), filtro)){dta = dta %>% filter(hire_leaver_anyo_org_only == 'nuevo o egrasado') %>% select(-c(hire_leaver_anyo_org_only))}
  
  if(grepl(paste0('\\b', 'id_appear', '\\b'), filtro)){dta = dta %>% filter(id_appear == 'nuevo contratado' | id_appear == 1) %>% select(-c(id_appear))}
  if(grepl(paste0('\\b', 'id_disappear', '\\b'), filtro)){dta = dta %>% filter(id_disappear == 'egrasado' | id_disappear == 1) %>% select(-c(id_disappear))}
  
  if(grepl(paste0('\\b', 'contract_type_temporario', '\\b'), filtro)){dta = dta %>% filter(contract_type_dummy  == 'temporario') %>% select(-c(contract_type_dummy ))}
  if(grepl(paste0('\\b', 'contract_type_permanente', '\\b'), filtro)){dta = dta %>% filter(contract_type_dummy  == 'permanente') %>% select(-c(contract_type_dummy ))}
  
  if(grepl(paste0('\\b', 'grado_directivo', '\\b'),  filtro)){dta = dta %>% filter(grado_directivo == 'directivo o gerente') %>% select(-c(grado_directivo))}
  if(grepl(paste0('\\b', 'grado_directivo_anyo', '\\b'),  filtro)){dta = dta %>% filter(grado_directivo_anyo == 'directivo o gerente') %>% select(-c(grado_directivo_anyo))}
  
  if(grepl(paste0('\\b', 'promocion_anyo', '\\b'),  filtro)){dta = dta %>% filter(promocion_anyo == 'ascendido') %>% select(-c(promocion_anyo))}
  if(grepl(paste0('\\b', 'promocion_anyo_org', '\\b'),  filtro)){dta = dta %>% filter(promocion_anyo_org == 'ascendido') %>% select(-c(promocion_anyo_org))}
  if(grepl(paste0('\\b', 'promocion_anyo_org_out', '\\b'),  filtro)){dta = dta %>% filter(promocion_anyo_org_out == 'ascendido') %>% select(-c(promocion_anyo_org_out))}
  if(grepl(paste0('\\b', 'promocion_directivo_anyo', '\\b'),  filtro)){dta = dta %>% filter(promocion_directivo_anyo == 'ascendido') %>% select(-c(promocion_directivo_anyo))}
  if(grepl(paste0('\\b', 'promocion_directivo_anyo_org', '\\b'),  filtro)){dta = dta %>% filter(promocion_directivo_anyo_org == 'ascendido') %>% select(-c(promocion_directivo_anyo_org))}
  
  if(grepl(paste0('\\b', 'multi_anyo', '\\b'),  filtro)){dta = dta %>% filter(multi_anyo == 'pago múltiple en al menos un mes') %>% select(-c(multi_anyo))}
  
  
  
  # if(grepl(paste0('\\b', 'fecha_jubilacion_anyo', '\\b'),  filtro)){dta = dta %>% 
  #   filter(fecha_jubilacion_anyo %in% (((as.numeric(anyo_fin) + 1)):((as.numeric(anyo_fin) + 5))))}
  # 
  # if(grepl(paste0('\\b', 'fecha_jubilacion_alcanzado_dummy', '\\b'),  filtro)){dta = dta %>% filter(fecha_jubilacion_alcanzado_dummy == 'post edad jubilacion') %>% select(-c(fecha_jubilacion_alcanzado_dummy))}
  
  ### adjust columns to make for a clean entry to the dashboard without (much) further edits
  # NOTE: this needs to be done after filtering above, as the filtering uses uncleaned column names
  
  
  ### final cleaning  -----------------------------------------------------------------------------------------------------------------------------------------------
  
  # we want year/anyo + 3 core groups (coverage, sectorial, and organization) + value (and possibly fte_sum) to be always present
  # but we don't want separate columns for each additional grouping (e.g. separate 'region' or 'contract_type' columns),
  # so rename those groupings as group_xx 
  non_matching <- setdiff(names(dta), c('anyo', 'country', 'sectorial_nombre', 'entidad_nombre',
                                        'fte_sum', 'value'))
  if(length(non_matching) > 0){
    
    new_names <- paste0("group_", seq_along(non_matching))
    
    dta = dta %>%
      rename_with(
        .fn = ~ setNames(new_names, non_matching)[.],
        .cols = all_of(non_matching))
  }
  
  
  
  dta = dta %>%
    filter(!is.na(value) & !is.infinite(value)) %>% # remove NA and Inf values, if any
    mutate(anyo = as.numeric(anyo)) %>% 
    add_column(indicador = indicador, .before = 1 ) %>% # add indicator name
    add_column(cubertura = ifelse(cubertura_type == 'max_cubertura',
                                  'Maximizar cubertura', 'Maximizar comparabilidad'), .after = 'country') %>% # add coverage type
    # mutate(cubertura = ifelse(cubertura_type == 'max_cubertura', 'Maximizar cubertura', 'Maximizar comparabilidad a lo largo del tiempo')) %>%  # full descriptive coverage variable names
    distinct %>% # leave only unique values (needed when forcing the grouping by additional core variables, e.g. always add entidad grouping)
    { if ("anyo" %in% names(.)) arrange(., anyo) else . } # organize by year (if present)
  
  
  # check if the data.frame not empty, if so, return an error
  if(nrow(dta) == 0){
    stop('FUNCTION DISRUPTED: RESULTING DATAFRAME HAS 0 ROWS')
  }
  
  ### print values to control the function's behaviour?
  # print(indicador) # name of the indicator being calculated
  # print(head(dta, n=8)) # first N rows of the final df
  print(summary(dta$value)) # summary of the values obtained
  
  #### return final df -----------------------------------------------------------------------------------------
  return(dta)
  
  
}







# ' ---------------------------------------------------------------------------------------------------
# LOOP ---------------------------------------------------------------------------------------------------
# 

# read or update instructions to the newest version before running the loop
# read or update instructions to the newest version before running the loop
read_instructions()


beep('complete')

### run loop....
start_row = 1 # start from 1st row to run the full loop, start in the middle if some rows already done or when testing
row_instructions = 1
# seccion1='Rotación'

# all sections listed in the instructions file
secciones = unique(country_instructions$seccion)


# for(seccion1 in secciones){ # when running for all sections
for(seccion1 in tail(secciones, -2)){ # when we want to select sections to run for: here from the 3rd section onwards
  # (tail(x, -2) returns an empty vector when there are fewer than 3 sections, unlike 3:n which would count backwards)
  
  ### rows of this section that should actually be run
  filas_seccion = unique(country_instructions$fila[which(country_instructions$seccion == seccion1)])
  filas_num = as.numeric(filas_seccion)
  # skip any? (if already done or when testing):
  #   - anything before 'start_row'
  #   - rows 900+ are there (at least for now) only for specific figures and as such don't form part of the
  #     final dashboard dataset. Their numbering starts from 900 (actual dashboard indicators are <200), so
  #     they can be cleanly 'cut off' here
  filas_run = filas_seccion[which(filas_num >= start_row & filas_num < 900)]
  
  if(length(filas_run) == 0){next} # nothing to calculate (or save) for this section
  
  count = 0
  for(row_instructions in filas_run){
    
    # control loop
    print(paste0(row_instructions, ')  ',
                 country_instructions$indicador[country_instructions$fila == row_instructions]))
    cat('\n\n')
    
    
    # detach tidylog if needed (too much output printed otherwise)
    if ("package:tidylog" %in% search()) {
      detach("package:tidylog", unload = TRUE, character.only = TRUE)
    }
    
    
    ### apply function -------------------------------------------------------------------------------------------------------------------------------
    ### NOTE: apply the dashboard summary function to all desired groupings - full vs partial org coverage (max_cubertura vs max_comparabilidad) both overall, 
    #### and across sectors, and across individual organizations
    
    temp = rbindlist(list(
      dash_summary(row_instructions, cubertura_type = 'max_cubertura', grupo_central = 'country'),
      dash_summary(row_instructions, cubertura_type = 'max_comparabilidad', grupo_central = 'country'),
      dash_summary(row_instructions, cubertura_type = 'max_cubertura', grupo_central = c('country', 'sectorial_nombre')),
      dash_summary(row_instructions, cubertura_type = 'max_comparabilidad', grupo_central = c('country', 'sectorial_nombre')),
      dash_summary(row_instructions, cubertura_type = 'max_cubertura', grupo_central = c('country', 'sectorial_nombre', 'entidad_nombre')),
      dash_summary(row_instructions, cubertura_type = 'max_comparabilidad', grupo_central = c('country', 'sectorial_nombre', 'entidad_nombre'))
    ),
    use.names = TRUE, fill = TRUE) %>% 
      relocate(value, .after = last_col()) %>% 
      distinct
    
    
    # add to the full dataset
    count = count + 1
    if(count == 1 || !exists('dta_all')){ # first iteration of the section (or dta_all non-existent) -> create the dataset
      dta_all = temp
    }else{
      dta_all = rbindlist(list(dta_all, temp), use.names = TRUE, fill = TRUE)
    }
    
    
    
    ### control values?  
    print(tail(funique(dta_all$indicador), n = 10)) # list of unique indicators (last N to avoid cluttering the console)
    print(fdistinct(dta_all$indicador)) # number of unique indicators
    print(dim(dta_all)) # dimensions of the dataset
    
  }
  
  
  ### save cleanly by section -------------------------------------------------------------------------------------------------------------------------
  # NOTE: done once, after all rows of the section have been processed. Triggering the save from inside the
  # loop when the current row equals the section's highest 'fila' would never fire if that row is skipped
  # (e.g. it is >= 900), and would fire at the wrong row if 'fila' is stored as text
  if(!exists('dta_all')){next}
  
  dta_all2 = dta_all # unnecessary, but stores dta_all2 data before final cleaning, in case any edits need to be done
  
  ### sentence-case across grouping columns
  if(!'group_1' %in% names(dta_all)){dta_all$group_1 = NA}
  if(!'group_2' %in% names(dta_all)){dta_all$group_2 = NA}
  
  dta_all = dta_all %>% mutate(across(any_of(c('country', 'cubertura', 'sectorial_nombre', 'group_1', 'group_2')), ~str_to_sentence(.)),
                               entidad_nombre = title_case_spanish(entidad_nombre))
  
  
  ### assign 'Todas las organizaciones' to missing core columns (cubertura, sectorial, entidad)
  dta_all = dta_all %>% ungroup %>% mutate(across(c(cubertura, sectorial_nombre, entidad_nombre), ~replace_na(.x, "Todas las organizaciones")))
  
  
  ## re-name columns
  dta_all = dta_all %>% rename(
    tipo_organizacion = sectorial_nombre,
    organizacion = entidad_nombre
  )
  
  ### re-arrange columns
  dta_all = dta_all %>%  relocate(value, .after = last_col())
  
  
  ### dates (temp?)
  dta_all = dta_all %>% 
    mutate(across(where(lubridate::is.instant), as.character)) %>% # any date-like column to characters (easier to handle thereafter)
    mutate(anyo = ifelse(grepl('promedio de 2019-2024', indicador), '2019-2024', anyo)) %>% # change promedio de 2019-2024 to 2019-2024
    distinct()
  
  
  ### save: file name = "Country database <ddmmyyyy> (<section>)"
  file_stub = paste0("Country database", 
                     format(Sys.time(), " %d%m%Y"),
                     # format(Sys.time(), "%d_%m_%Y (%H_%M)"), 
                     ' (', clean_text(seccion1), ')')
  
  fwrite(dta_all %>%  distinct, 
         file = file.path(main_dir, 'data', 'clean','00_database', paste0(file_stub, '.csv')), 
         na = NA, row.names = FALSE, encoding = 'UTF-8')
  
  ### save (copy) 
  # NOTE(review): this writes the same cleaned data as the file above - confirm whether the 'raw copy'
  # was meant to hold dta_all2 (the data before final cleaning) instead
  fwrite(x = dta_all %>%  distinct, 
         file = file.path(main_dir, 'data', 'clean','00_database', paste0(file_stub, ' - raw copy', '.csv')), 
         na = NA, row.names = FALSE, encoding = 'UTF-8')
  
  
  rm(dta_all) # remove the object to ensure it's properly created from scratch for each new section and doesn't occupy space
  
}


exec_time_fun('exec_time')



# ' ------------------------------------------------------------------------------------------------------------------------------------------------------
# END OF CODE  --------------------------------------------------------------------------------------------------------------------------------------------
#